This document summarizes Rick Gilmore’s analysis of the Jaccard index data.
The goal is to explore ways to generate an empirical “null” distribution of the Jaccard index data to compare it to the observed data.
Note: I have set eval=FALSE for a series of chunks below that generate the permuted sorting and Jaccard index data. These take many minutes to run.
/analysis/data/{P1,P31M,P3M1,P6,P6M}-sorting.csv and create new CSVs with the permuted values.analysis/make.jaccard.df.R function.n times, where n is large, probably 1,000.Let’s build and test a permutation function for the raw sorting data.
Now, let’s generate multiple permuted CSVs.
generate_n_sorting_permutations <-
function(wp_group = "P1",
n_permutations = 5) {
csv_in <- paste0("analysis/data/", wp_group, "-sorting.csv")
if (!file.exists(csv_in)) {
stop(paste0("`csv_in` not found: ", csv_in))
}
df_in <- readr::read_csv(csv_in)
df_exemplars <- df_in[, -c(1, 2, 23, 24)]
out_m <- as.matrix(df_exemplars)
for (p in 1:n_permutations) {
csv_out <-
paste0(
"analysis/data/permutation_analysis/sorting_csv/",
wp_group,
"-sorting-perm-",
stringr::str_pad(p, 3, pad = 0), ".csv"
)
for (r in 1:dim(out_m)[1]) {
new_i <- sample(1:20)
out_m[r, 1:20] <- out_m[r, new_i]
}
array_out <-
as.data.frame(cbind(df_in$Participant, df_in$Set, out_m, df_in$Set_size, df_in$Group))
# Rename!
names(array_out) <-
c("Participant",
"Set",
names(df_exemplars),
"Set_size",
"Group")
array_out
readr::write_csv(array_out, csv_out)
}
}
Then we test it.
generate_n_sorting_permutations()
##
## ── Column specification ────────────────────────────────────────────────────────
## cols(
## .default = col_double(),
## Participant = col_character(),
## Group = col_character()
## )
## ℹ Use `spec()` for the full column specifications.
Now, let’s confirm that we can calculate Jaccard indices from these data.
make_jaccard_csvs <- function(wallpaper_group = "P1",
duplicates = FALSE,
input_dir = 'analysis/data/permutation_analysis/sorting_csv/',
output_dir = 'analysis/data/permutation_analysis/jaccards/') {
# Makes a data.frame from the raw sorting data
# Load externals
source("analysis/jaccard.data.R")
source("analysis/jaccard.R")
these_csvs <-
list.files(input_dir, paste0("^", wallpaper_group, "\\-"), full.names = TRUE)
purrr::map(these_csvs,
calculate_save_jaccard_df,
wallpaper_group,
jaccard_dir = output_dir)
}
# Load a sorting permutation file, calculate the Jaccard indices, and (conditionally) save it to file.
calculate_save_jaccard_df <- function(this_csv,
wallpaper_group,
save_output = TRUE,
jaccard_dir = "analysis/data/permutation_analysis/jaccards/",
vb = FALSE) {
this_fn <- basename(this_csv)
this_perm_number <- stringr::str_extract(this_fn, "[0-9]{3}")
out_fn <-
paste0(jaccard_dir,
wallpaper_group,
"-jaccard-",
this_perm_number,
".csv")
this_df <- readr::read_csv(this_csv)
# Calculate Jaccard
jaccard_df <- jaccard.data(this_df)
if (save_output) {
if (vb) message(paste0('Saving ', out_fn))
readr::write_csv(jaccard_df, out_fn)
} else {
jaccard_df
}
}
make_jaccard_csvs()
generate_n_sorting_permutations("P1", n_permutations = 999)
make_jaccard_csvs("P1")
generate_n_sorting_permutations("P31M", n_permutations = 999)
make_jaccard_csvs("P31M")
generate_n_sorting_permutations("P3M1", n_permutations = 999)
make_jaccard_csvs("P3M1")
generate_n_sorting_permutations("P6", n_permutations = 999)
make_jaccard_csvs("P6")
generate_n_sorting_permutations("P6M", n_permutations = 999)
make_jaccard_csvs("P6M")
make_perm_jaccard_df <- function(this_csv) {
this_fn <- basename(this_csv)
this_perm_number <- stringr::str_extract(this_fn, "[0-9]{3}")
this_df <- readr::read_csv(this_csv)
this_df <- this_df %>%
dplyr::mutate(
.,
exemplar_pair = paste0(
stringr::str_extract(Exemplar.Row, "[0-9]{3}$"),
"-",
stringr::str_extract(Exemplar.Col, "[0-9]{3}$")
),
perm = this_perm_number
)
this_df
}
make_aggregate_perm_jaccard_df <- function(wp_group = "P1",
input_dir = "analysis/data/permutation_analysis/jaccards",
save_csv = TRUE,
output_dir = "analysis/data/permutation_analysis/aggregates",
vb = TRUE) {
these_csvs <-
list.files(input_dir, paste0("^", wp_group, "\\-"), full.names = TRUE)
df <- purrr::map_df(these_csvs, make_perm_jaccard_df)
if (save_csv) {
out_fn <- file.path(output_dir, paste0(wp_group, "-aggregate-perm-jaccard.csv"))
if (vb) message(paste0("Saving ", out_fn))
readr::write_csv(df, out_fn)
} else {
df
}
}
Import the data.
P1_perm_df <- make_aggregate_perm_jaccard_df("P1")
P1_perm_df <- readr::read_csv("analysis/data/permutation_analysis/aggregates/P1-aggregate-perm-jaccard.csv")
##
## ── Column specification ────────────────────────────────────────────────────────
## cols(
## Exemplar.Row = col_double(),
## Exemplar.Col = col_double(),
## Jaccard = col_double(),
## Group = col_character(),
## exemplar_pair = col_character(),
## perm = col_character()
## )
Visualize.
P1_perm_df %>%
ggplot2::ggplot(.) +
ggplot2::aes(x = Jaccard) +
ggplot2::geom_histogram(bins = 50)
Generate summary stats by exemplar pair.
P1_perm_stats_df <- P1_perm_df %>%
dplyr::group_by(., Group, exemplar_pair) %>%
dplyr::summarize(., jaccard_mean = mean(Jaccard),
jaccard_sd = sd(Jaccard))
## `summarise()` has grouped output by 'Group'. You can override using the `.groups` argument.
Import the data.
P31M_perm_df <- make_aggregate_perm_jaccard_df("P31M")
P31M_perm_df <- readr::read_csv("analysis/data/permutation_analysis/aggregates/P31M-aggregate-perm-jaccard.csv")
##
## ── Column specification ────────────────────────────────────────────────────────
## cols(
## Exemplar.Row = col_double(),
## Exemplar.Col = col_double(),
## Jaccard = col_double(),
## Group = col_character(),
## exemplar_pair = col_character(),
## perm = col_character()
## )
Visualize.
P31M_perm_df %>%
ggplot2::ggplot(.) +
ggplot2::aes(x = Jaccard) +
ggplot2::geom_histogram(bins = 50)
Generate summary stats by exemplar pair.
P31M_perm_stats_df <- P31M_perm_df %>%
dplyr::group_by(., Group, exemplar_pair) %>%
dplyr::summarize(., jaccard_mean = mean(Jaccard),
jaccard_sd = sd(Jaccard))
## `summarise()` has grouped output by 'Group'. You can override using the `.groups` argument.
Import the data.
P3M1_perm_df <- make_aggregate_perm_jaccard_df("P3M1")
P3M1_perm_df <- readr::read_csv("analysis/data/permutation_analysis/aggregates/P3M1-aggregate-perm-jaccard.csv")
##
## ── Column specification ────────────────────────────────────────────────────────
## cols(
## Exemplar.Row = col_double(),
## Exemplar.Col = col_double(),
## Jaccard = col_double(),
## Group = col_character(),
## exemplar_pair = col_character(),
## perm = col_character()
## )
Visualize.
P3M1_perm_df %>%
ggplot2::ggplot(.) +
ggplot2::aes(x = Jaccard) +
ggplot2::geom_histogram(bins = 50)
Generate summary stats by exemplar pair.
P3M1_perm_stats_df <- P3M1_perm_df %>%
dplyr::group_by(., Group, exemplar_pair) %>%
dplyr::summarize(., jaccard_mean = mean(Jaccard),
jaccard_sd = sd(Jaccard))
## `summarise()` has grouped output by 'Group'. You can override using the `.groups` argument.
Import the data.
P6_perm_df <- make_aggregate_perm_jaccard_df("P6")
P6_perm_df <- readr::read_csv("analysis/data/permutation_analysis/aggregates/P6-aggregate-perm-jaccard.csv")
##
## ── Column specification ────────────────────────────────────────────────────────
## cols(
## Exemplar.Row = col_double(),
## Exemplar.Col = col_double(),
## Jaccard = col_double(),
## Group = col_character(),
## exemplar_pair = col_character(),
## perm = col_character()
## )
Visualize.
P6_perm_df %>%
ggplot2::ggplot(.) +
ggplot2::aes(x = Jaccard) +
ggplot2::geom_histogram(bins = 50)
Generate summary stats by exemplar pair.
P6_perm_stats_df <- P6_perm_df %>%
dplyr::group_by(., Group, exemplar_pair) %>%
dplyr::summarize(., jaccard_mean = mean(Jaccard),
jaccard_sd = sd(Jaccard))
## `summarise()` has grouped output by 'Group'. You can override using the `.groups` argument.
Import the data.
P6M_perm_df <- make_aggregate_perm_jaccard_df("P6M")
P6M_perm_df <- readr::read_csv("analysis/data/permutation_analysis/aggregates/P6M-aggregate-perm-jaccard.csv")
##
## ── Column specification ────────────────────────────────────────────────────────
## cols(
## Exemplar.Row = col_double(),
## Exemplar.Col = col_double(),
## Jaccard = col_double(),
## Group = col_character(),
## exemplar_pair = col_character(),
## perm = col_character()
## )
Visualize.
P6M_perm_df %>%
ggplot2::ggplot(.) +
ggplot2::aes(x = Jaccard) +
ggplot2::geom_histogram(bins = 50)
Generate summary stats by exemplar pair.
P6M_perm_stats_df <- P6M_perm_df %>%
dplyr::group_by(., Group, exemplar_pair) %>%
dplyr::summarize(., jaccard_mean = mean(Jaccard),
jaccard_sd = sd(Jaccard))
## `summarise()` has grouped output by 'Group'. You can override using the `.groups` argument.
jaccard_perm_df <- rbind(P1_perm_df, P31M_perm_df, P3M1_perm_df, P6_perm_df, P6M_perm_df)
Visualization.
jaccard_perm_df %>%
ggplot(.) +
aes(Jaccard, color = Group) +
facet_grid(Group ~ .) +
geom_boxplot(bins = 50)
## Warning: Ignoring unknown parameters: bins
jaccard_perm_df %>%
ggplot(.) +
aes(Jaccard, color = Group) +
facet_grid(Group ~ .) +
geom_boxplot(bins = 50)
## Warning: Ignoring unknown parameters: bins
jaccard_perm_df %>%
ggplot(.) +
aes(x = Group, y = Jaccard) +
geom_violin()
These plots show that the mean differences in Jaccard indices are reflected in the participants’ data are shown in the permuted data, too. This makes sense since the participants detected regularities and sorted the exemplars into different numbers of sets. In permuting the exemplar identifiers within participants, we keep some of this structure.
Let’s try aggregating the by-exemplar statistics.
jaccard_perm_stats_df <- rbind(P1_perm_stats_df, P31M_perm_stats_df, P3M1_perm_stats_df, P6_perm_stats_df, P6M_perm_stats_df)
# Sort by group, exemplar_pair
jaccard_perm_stats_df <- jaccard_perm_stats_df %>%
dplyr::arrange(Group, exemplar_pair)
jaccard_perm_stats_df %>%
ggplot(.) +
aes(x = jaccard_mean, fill = Group) +
geom_histogram() +
facet_grid(Group ~ .) +
ggtitle("Mean exemplar-pair Jaccard indices for permuted data")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
jaccard_perm_stats_df %>%
ggplot(.) +
aes(x = jaccard_sd, fill = Group) +
geom_histogram() +
facet_grid(Group ~ .) +
ggtitle("Standard deviation of exemplar-pair Jaccard indices for permuted data")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Load the observed data and clean it.
jaccard_observed_df <-
readr::read_csv("analysis/data/jaccard-no-duplicates.csv")
##
## ── Column specification ────────────────────────────────────────────────────────
## cols(
## Exemplar.Row = col_double(),
## Exemplar.Col = col_double(),
## Jaccard = col_double(),
## Group = col_character()
## )
jaccard_observed_df <- jaccard_observed_df %>%
dplyr::mutate(.,
exemplar_pair = paste0(
stringr::str_extract(Exemplar.Row, "[0-9]{3}$"),
"-",
stringr::str_extract(Exemplar.Col, "[0-9]{3}$")
)) %>%
dplyr::arrange(., Group, exemplar_pair)
Now, merge the permuted data with the observed data.
jaccard_merged_df <- dplyr::left_join(jaccard_perm_stats_df,
jaccard_observed_df,
by = c("Group", "exemplar_pair"))
# Rearrange columns for convenience
jaccard_merged_df <- jaccard_merged_df %>%
dplyr::select(., Group, exemplar_pair, Jaccard, jaccard_mean, jaccard_sd, Exemplar.Row, Exemplar.Col)
# Rename variables for clarity
jaccard_merged_df <- jaccard_merged_df %>%
dplyr::rename(., group = Group, jaccard_obs = Jaccard,
jaccard_emp_mean = jaccard_mean,
jaccard_emp_sd = jaccard_sd,
exemplar_row = Exemplar.Row,
exemplar_col = Exemplar.Col)
Calculate empirical z as \(z_{emp}=J_{obs}-\mu_{J}\) for each exemplar pair.
jaccard_merged_df <- jaccard_merged_df %>%
dplyr::mutate(., z_emp = (jaccard_obs-jaccard_emp_mean)/jaccard_emp_sd,
p_z_emp = pnorm(z_emp, jaccard_emp_mean, jaccard_emp_sd, lower.tail = FALSE))
Plot a histogram of z_emp.
jaccard_merged_df %>%
ggplot(.) +
aes(z_emp, fill = group) +
geom_histogram() +
facet_grid(group ~ .)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Curiously, P31M, P6, P6M and P3M1 have exemplar pairs whose observed Jaccard indices are substantially larger than the empirically derived reference (null) distribution even though the mean Jaccard indices for P1 are the largest.
Just for fun, let’s print the exemplar pairs whose z_emp exceed 3.7190165` (\(p<.0001\)).
jaccard_merged_df %>%
dplyr::filter(., z_emp > 4) %>%
dplyr::arrange(., group, desc(jaccard_emp_mean)) %>%
knitr::kable(.)
| group | exemplar_pair | jaccard_obs | jaccard_emp_mean | jaccard_emp_sd | exemplar_row | exemplar_col | z_emp | p_z_emp |
|---|---|---|---|---|---|---|---|---|
| P1 | 008-009 | 0.4347826 | 0.1997302 | 0.0523893 | 101008 | 101009 | 4.486650 | 0 |
| P31M | 006-016 | 0.3469388 | 0.1544913 | 0.0455907 | 115006 | 115016 | 4.221202 | 0 |
| P31M | 002-020 | 0.4347826 | 0.1541413 | 0.0445330 | 115002 | 115020 | 6.301875 | 0 |
| P31M | 004-009 | 0.4666667 | 0.1536301 | 0.0458028 | 115004 | 115009 | 6.834437 | 0 |
| P31M | 008-015 | 0.3469388 | 0.1534583 | 0.0454080 | 115008 | 115015 | 4.260934 | 0 |
| P31M | 014-020 | 0.5000000 | 0.1532609 | 0.0443117 | 115014 | 115020 | 7.824999 | 0 |
| P31M | 007-020 | 0.3469388 | 0.1530137 | 0.0472758 | 115007 | 115020 | 4.101998 | 0 |
| P31M | 002-007 | 0.6500000 | 0.1528091 | 0.0456546 | 115002 | 115007 | 10.890260 | 0 |
| P31M | 002-014 | 0.4347826 | 0.1523665 | 0.0464296 | 115002 | 115014 | 6.082672 | 0 |
| P31M | 007-014 | 0.3469388 | 0.1520057 | 0.0465605 | 115007 | 115014 | 4.186665 | 0 |
| P31M | 009-014 | 0.3469388 | 0.1501004 | 0.0442869 | 115009 | 115014 | 4.444622 | 0 |
| P3M1 | 019-020 | 0.4042553 | 0.1436215 | 0.0475534 | 114019 | 114020 | 5.480869 | 0 |
| P3M1 | 003-016 | 0.3469388 | 0.1431156 | 0.0447568 | 114003 | 114016 | 4.554010 | 0 |
| P3M1 | 011-019 | 0.3750000 | 0.1415372 | 0.0445837 | 114011 | 114019 | 5.236508 | 0 |
| P6 | 001-017 | 0.3200000 | 0.1384481 | 0.0436607 | 116001 | 116017 | 4.158245 | 0 |
| P6 | 007-009 | 0.4666667 | 0.1380211 | 0.0441697 | 116007 | 116009 | 7.440517 | 0 |
| P6 | 014-017 | 0.3200000 | 0.1376545 | 0.0451294 | 116014 | 116017 | 4.040509 | 0 |
| P6 | 013-019 | 0.4888889 | 0.1373315 | 0.0433875 | 116013 | 116019 | 8.102725 | 0 |
| P6 | 005-013 | 0.3333333 | 0.1365565 | 0.0442138 | 116005 | 116013 | 4.450579 | 0 |
| P6 | 002-011 | 0.4042553 | 0.1363300 | 0.0431172 | 116002 | 116011 | 6.213881 | 0 |
| P6 | 008-009 | 0.3541667 | 0.1361674 | 0.0434391 | 116008 | 116009 | 5.018501 | 0 |
| P6 | 008-020 | 0.3265306 | 0.1359853 | 0.0425070 | 116008 | 116020 | 4.482680 | 0 |
| P6 | 006-019 | 0.4666667 | 0.1356848 | 0.0441123 | 116006 | 116019 | 7.503171 | 0 |
| P6 | 003-014 | 0.3265306 | 0.1354559 | 0.0438507 | 116003 | 116014 | 4.357394 | 0 |
| P6 | 006-013 | 0.5581395 | 0.1348380 | 0.0450430 | 116006 | 116013 | 9.397711 | 0 |
| P6M | 008-010 | 0.3333333 | 0.1448769 | 0.0455945 | 117008 | 117010 | 4.133318 | 0 |
| P6M | 008-020 | 0.3541667 | 0.1437420 | 0.0410528 | 117008 | 117020 | 5.125709 | 0 |
| P6M | 001-011 | 0.3333333 | 0.1433422 | 0.0465964 | 117001 | 117011 | 4.077378 | 0 |
| P6M | 010-020 | 0.3829787 | 0.1432528 | 0.0453747 | 117010 | 117020 | 5.283256 | 0 |
Plot a histogram of p_z_emp or the probability associated with the empirical z.
jaccard_merged_df %>%
ggplot(.) +
aes(p_z_emp, fill = group) +
geom_histogram() +
facet_grid(group ~ .)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Create function to extract data and convert to matrix.
extract_exemplar_index <- function(e) {
stringr::str_extract(e, '[0-9]{2}$')
}
extract_exemplar_emp_z_matrix <- function(wp_group = "P1", df = jaccard_merged_df) {
this_df <- df %>%
dplyr::filter(., group == wp_group)
this_matrix <- matrix(nrow = 20, ncol = 20)
for (r in 1:190) {
this_matrix[as.numeric(extract_exemplar_index(this_df$exemplar_row[r])),
as.numeric(extract_exemplar_index(this_df$exemplar_col[r]))] <-
this_df$z_emp[r]
}
this_matrix
}
wp_emp_mean <- function(wp_group = "P1", df = jaccard_merged_df) {
this_df <- df %>%
dplyr::filter(., group == wp_group)
mean(this_df$jaccard_emp_mean)
}
wp_emp_sd <- function(wp_group = "P1", df = jaccard_merged_df) {
this_df <- df %>%
dplyr::filter(., group == wp_group)
sd(this_df$jaccard_emp_mean)
}
Test the functions.
wp_emp_mean()
## [1] 0.2008955
wp_emp_sd()
## [1] 0.001735995
extract_exemplar_emp_z_matrix()
## [,1] [,2] [,3] [,4] [,5] [,6] [,7]
## [1,] NA 1.278546 -0.4063873 -1.5587622 0.4846571 0.04668384 -0.86372090
## [2,] NA NA -0.4582298 -0.4274744 -0.4186175 1.79035535 -0.84568146
## [3,] NA NA NA 0.8835449 -0.3725606 -0.40307115 1.26660435
## [4,] NA NA NA NA 1.3404188 2.26643744 -0.04013945
## [5,] NA NA NA NA NA 0.47331811 0.88191730
## [6,] NA NA NA NA NA NA 0.85560584
## [7,] NA NA NA NA NA NA NA
## [8,] NA NA NA NA NA NA NA
## [9,] NA NA NA NA NA NA NA
## [10,] NA NA NA NA NA NA NA
## [11,] NA NA NA NA NA NA NA
## [12,] NA NA NA NA NA NA NA
## [13,] NA NA NA NA NA NA NA
## [14,] NA NA NA NA NA NA NA
## [15,] NA NA NA NA NA NA NA
## [16,] NA NA NA NA NA NA NA
## [17,] NA NA NA NA NA NA NA
## [18,] NA NA NA NA NA NA NA
## [19,] NA NA NA NA NA NA NA
## [20,] NA NA NA NA NA NA NA
## [,8] [,9] [,10] [,11] [,12] [,13]
## [1,] -0.4170658 0.91998657 0.39258792 0.08265938 -0.8205435 0.04047028
## [2,] 0.3820315 0.90692441 -0.88589284 2.43642158 -2.3758630 -1.29048260
## [3,] 1.3493598 0.44475779 -0.42231484 1.48514030 1.3501212 -1.24717244
## [4,] 1.2670747 0.91684340 -0.06783505 1.95939390 -0.0363671 -0.48506776
## [5,] -0.8272640 -0.01028528 -0.39954466 0.53513838 0.9527189 -0.03654542
## [6,] -0.8246242 -1.18513659 -1.55311620 0.52395716 -2.3053761 0.02056386
## [7,] -1.9834361 -0.81164814 1.72676587 -1.52011445 2.1843667 -1.19457034
## [8,] NA 4.48664976 -1.57559245 0.96491370 -0.3760906 -0.82638977
## [9,] NA NA 0.46071891 1.01567374 -0.8051875 -0.44645379
## [10,] NA NA NA -0.72392426 0.4382522 -1.23232410
## [11,] NA NA NA NA -1.9134128 0.47737007
## [12,] NA NA NA NA NA 1.26378565
## [13,] NA NA NA NA NA NA
## [14,] NA NA NA NA NA NA
## [15,] NA NA NA NA NA NA
## [16,] NA NA NA NA NA NA
## [17,] NA NA NA NA NA NA
## [18,] NA NA NA NA NA NA
## [19,] NA NA NA NA NA NA
## [20,] NA NA NA NA NA NA
## [,14] [,15] [,16] [,17] [,18]
## [1,] -8.450266e-01 2.25346232 0.46654942 1.266958620 -0.45569165
## [2,] -7.454175e-02 -0.42818102 -0.03996971 -0.843992742 -0.45993890
## [3,] -2.353633e-02 2.17214667 -0.40745546 1.805807998 0.01955637
## [4,] -8.772070e-01 -0.84589851 -1.28706043 -0.005957962 -0.04586250
## [5,] -1.090300e-02 1.24828571 0.86896324 -1.551022672 0.94420009
## [6,] -1.185326e+00 -0.06913131 -1.16833936 -0.444435357 1.23212863
## [7,] -4.587557e-01 2.28144774 1.77045809 1.769055737 0.36569635
## [8,] -5.251895e-05 -2.00041056 -1.86882703 -0.448893957 0.41133069
## [9,] 5.565490e-03 -0.40475349 -1.16208327 1.309903275 0.03547131
## [10,] -4.538015e-01 2.74519701 3.88341611 0.407380715 -2.30123847
## [11,] 4.206246e-02 -0.35592516 -1.50125876 0.472141256 -0.35533988
## [12,] 8.320571e-01 1.28992131 -0.02461837 -0.836023062 -1.22388245
## [13,] 4.032674e-01 -0.41612144 -1.20419690 -1.183330632 1.77401400
## [14,] NA -0.83358345 -0.05385836 -0.841892714 -0.43985972
## [15,] NA NA 1.80969188 0.894490657 -0.47778554
## [16,] NA NA NA 2.282295285 -2.38702604
## [17,] NA NA NA NA -2.23676330
## [18,] NA NA NA NA NA
## [19,] NA NA NA NA NA
## [20,] NA NA NA NA NA
## [,19] [,20]
## [1,] 0.434817431 -0.85052280
## [2,] -0.407690961 -0.47929098
## [3,] -0.387107712 -0.84816357
## [4,] -0.806827174 0.35969809
## [5,] -0.773074742 -0.37698353
## [6,] -0.437145373 -0.37783743
## [7,] 3.476615059 0.78358220
## [8,] -1.289939900 -1.92388138
## [9,] -0.405258726 -1.53542408
## [10,] 1.320221281 2.86840721
## [11,] -2.295820616 -0.36815069
## [12,] 1.870930810 0.82816252
## [13,] -0.421732928 -0.44874426
## [14,] 1.277881102 -0.05811713
## [15,] 0.862228210 -0.43978299
## [16,] -0.006930371 3.36936846
## [17,] -0.002987090 1.32621659
## [18,] -1.854978934 -2.01838497
## [19,] NA 2.29590443
## [20,] NA NA
Now, let’s turn to the heatmap.
# Based on stats::heatmap(), but with different defaults for axis labels
my_heatmap <- function (x, Rowv = NULL, Colv = if (symm) "Rowv" else NULL,
distfun = dist, hclustfun = hclust, reorderfun = function(d,
w) reorder(d, w), add.expr, symm = FALSE, revC = identical(Colv,
"Rowv"), scale = c("row", "column", "none"), na.rm = TRUE,
margins = c(5, 5), ColSideColors, RowSideColors, cexRow = 0.2 +
1/log10(nr), cexCol = 0.2 + 1/log10(nc), labRow = NULL,
labCol = NULL, main = NULL, xlab = NULL, ylab = NULL,
xside = 2, yside = 3, # 1=bottom, 2=left, 3=top, 4=right
keep.dendro = FALSE,
verbose = getOption("verbose"), ...)
{
scale <- if (symm && missing(scale))
"none"
else match.arg(scale)
if (length(di <- dim(x)) != 2 || !is.numeric(x))
stop("'x' must be a numeric matrix")
nr <- di[1L]
nc <- di[2L]
if (nr <= 1 || nc <= 1)
stop("'x' must have at least 2 rows and 2 columns")
if (!is.numeric(margins) || length(margins) != 2L)
stop("'margins' must be a numeric vector of length 2")
doRdend <- !identical(Rowv, NA)
doCdend <- !identical(Colv, NA)
if (!doRdend && identical(Colv, "Rowv"))
doCdend <- FALSE
if (is.null(Rowv))
Rowv <- rowMeans(x, na.rm = na.rm)
if (is.null(Colv))
Colv <- colMeans(x, na.rm = na.rm)
if (doRdend) {
if (inherits(Rowv, "dendrogram"))
ddr <- Rowv
else {
hcr <- hclustfun(distfun(x))
ddr <- as.dendrogram(hcr)
if (!is.logical(Rowv) || Rowv)
ddr <- reorderfun(ddr, Rowv)
}
if (nr != length(rowInd <- order.dendrogram(ddr)))
stop("row dendrogram ordering gave index of wrong length")
}
else rowInd <- 1L:nr
if (doCdend) {
if (inherits(Colv, "dendrogram"))
ddc <- Colv
else if (identical(Colv, "Rowv")) {
if (nr != nc)
stop("Colv = \"Rowv\" but nrow(x) != ncol(x)")
ddc <- ddr
}
else {
hcc <- hclustfun(distfun(if (symm)
x
else t(x)))
ddc <- as.dendrogram(hcc)
if (!is.logical(Colv) || Colv)
ddc <- reorderfun(ddc, Colv)
}
if (nc != length(colInd <- order.dendrogram(ddc)))
stop("column dendrogram ordering gave index of wrong length")
}
else colInd <- 1L:nc
x <- x[rowInd, colInd]
labRow <- labRow[rowInd] %||% rownames(x) %||% (1L:nr)[rowInd]
labCol <- labCol[colInd] %||% colnames(x) %||% (1L:nc)[colInd]
if (scale == "row") {
x <- sweep(x, 1L, rowMeans(x, na.rm = na.rm), check.margin = FALSE)
sx <- apply(x, 1L, sd, na.rm = na.rm)
x <- sweep(x, 1L, sx, "/", check.margin = FALSE)
}
else if (scale == "column") {
x <- sweep(x, 2L, colMeans(x, na.rm = na.rm), check.margin = FALSE)
sx <- apply(x, 2L, sd, na.rm = na.rm)
x <- sweep(x, 2L, sx, "/", check.margin = FALSE)
}
lmat <- rbind(c(NA, 3), 2:1)
lwid <- c(if (doRdend) 1 else 0.05, 4)
lhei <- c((if (doCdend) 1 else 0.05) + if (!is.null(main)) 0.2 else 0,
4)
if (!missing(ColSideColors)) {
if (!is.character(ColSideColors) || length(ColSideColors) !=
nc)
stop("'ColSideColors' must be a character vector of length ncol(x)")
lmat <- rbind(lmat[1, ] + 1, c(NA, 1), lmat[2, ] + 1)
lhei <- c(lhei[1L], 0.2, lhei[2L])
}
if (!missing(RowSideColors)) {
if (!is.character(RowSideColors) || length(RowSideColors) !=
nr)
stop("'RowSideColors' must be a character vector of length nrow(x)")
lmat <- cbind(lmat[, 1] + 1, c(rep(NA, nrow(lmat) - 1),
1), lmat[, 2] + 1)
lwid <- c(lwid[1L], 0.2, lwid[2L])
}
lmat[is.na(lmat)] <- 0
if (verbose) {
cat("layout: widths = ", lwid, ", heights = ", lhei,
"; lmat=\n")
print(lmat)
}
dev.hold()
on.exit(dev.flush())
op <- par(no.readonly = TRUE)
on.exit(par(op), add = TRUE)
layout(lmat, widths = lwid, heights = lhei, respect = TRUE)
if (!missing(RowSideColors)) {
par(mar = c(margins[1L], 0, 0, 0.5))
image(rbind(if (revC)
nr:1L
else 1L:nr), col = RowSideColors[rowInd], axes = FALSE)
}
if (!missing(ColSideColors)) {
par(mar = c(0.5, 0, 0, margins[2L]))
image(cbind(1L:nc), col = ColSideColors[colInd], axes = FALSE)
}
par(mar = c(margins[1L], 0, 0, margins[2L]))
if (!symm || scale != "none")
x <- t(x)
if (revC) {
iy <- nr:1
if (doRdend)
ddr <- rev(ddr)
x <- x[, iy]
}
else iy <- 1L:nr
image(1L:nc, 1L:nr, x, xlim = 0.5 + c(0, nc), ylim = 0.5 +
c(0, nr), axes = FALSE, xlab = "", ylab = "", ...)
axis(xside, 1L:nc, labels = labCol, las = 2, line = -0.5, tick = 0,
cex.axis = cexCol)
if (!is.null(xlab))
mtext(xlab, side = xside, line = margins[1L] - 1.25)
axis(yside, iy, labels = labRow, las = 2, line = -0.5, tick = 0,
cex.axis = cexRow)
if (!is.null(ylab))
mtext(ylab, side = yside, line = margins[2L] - 1.25)
if (!missing(add.expr))
eval.parent(substitute(add.expr))
par(mar = c(margins[1L], 0, 0, 0))
if (doRdend)
plot(ddr, horiz = TRUE, axes = FALSE, yaxs = "i", leaflab = "none")
else frame()
par(mar = c(0, 0, if (!is.null(main)) 1 else 0, margins[2L]))
if (doCdend)
plot(ddc, axes = FALSE, xaxs = "i", leaflab = "none")
else if (!is.null(main))
frame()
if (!is.null(main)) {
par(xpd = NA)
title(main, cex.main = 1.5 * op[["cex.main"]])
}
invisible(list(rowInd = rowInd, colInd = colInd, Rowv = if (keep.dendro &&
doRdend) ddr, Colv = if (keep.dendro && doCdend) ddc))
}
Combine the above into a new plotting function.
value_breaks <- c(0, .0001, .001, .01, .1, 1)
color_palate <- RColorBrewer::brewer.pal(length(value_breaks)-1, "Oranges")
rev_color_palate <- color_palate[length(color_palate):1]
value_colors <-
colorRampPalette(color_palate)(length(rev_color_palate))
legend_text <- c("<.0001", "<.001", "<.01", ".1", ">.1")
plot_heatmap_p <- function(wp_group = "P1",
df = jaccard_merged_df,
show_legend = TRUE) {
# Select wp_group
j_matrix <- extract_exemplar_emp_z_matrix(wp_group)
my_heatmap(
j_matrix,
Rowv = NA,
Colv = NA,
symm = TRUE,
margins = c(30,30),
col = value_colors,
breaks = value_breaks,
cexRow = 3.0,
cexCol = 3.0
)
if (show_legend) {
legend(x = "bottomright",
legend = legend_text,
fill = value_colors)
}
}
Test the heatmap plotting function.
plot_heatmap_p("P1")
plot_heatmap_p("P31M")
plot_heatmap_p("P3M1")
plot_heatmap_p("P6")
plot_heatmap_p("P6M")
Why these graphs are so different from those in exemplar-jaccard-plots.Rmd has me stumped. I’m especially confused about why the top and side margins don’t seem to behave as expeted.
I’m going to try a different approach to summarizing these. Let’s add a categorical variable to indicate \(p\) value of the empirical z.
jaccard_merged_df <- jaccard_merged_df %>%
dplyr::mutate(.,
p_z_cuts =
cut(
p_z_emp,
c(0, .0001, .001, .01, .1, 1),
labels = c("<.0001", "<.001", "<.01", "<.1", ">.1"),
include.lowest = TRUE
))
jaccard_merged_df %>%
ggplot(.) +
aes(p_z_cuts, fill = group) +
geom_bar() +
facet_grid(group ~ .)
Or in tabular form:
xtabs(~ p_z_cuts + group, jaccard_merged_df)
## group
## p_z_cuts P1 P31M P3M1 P6 P6M
## <.0001 69 68 73 68 77
## <.001 3 0 6 0 1
## <.01 1 0 2 0 1
## <.1 0 0 0 0 0
## >.1 117 122 109 122 111